About

In this markdown, we will do a comparison of the transcriptomes during development of Ptychodera and Branchiostoma floridae.

For this, we will use a solution of custom R functions and wrappers that we have named “comparABle”.

Load libraries

library(tidyverse)
library(stringr)
library(BiocGenerics)
library(EDASeq)
library(DESeq2)
library(tximport)
library(limma)
library(ComplexHeatmap)
library(circlize)
library(RColorBrewer)
require(data.table)
require(stringr)
require(tidytree)
require(ggtree)
require(rvcheck)
require(treeio)
require(dendextend)
require(phangorn)
require(phytools)
require(ComplexHeatmap)
require(foreach)
require(doParallel)
require(colorspace)
library(topGO)

Load functions

source("code/r_code/r_functions/sourcefolder.R")

sourceFolder(
  "code/r_code/r_functions",
  recursive = TRUE
  )

sourceFolder(
  "code/r_code/r_general",
  recursive = TRUE
  )

Load Data

We will first load the data of our species:

# Ptychodera analyses
load("outputs/rda/deseq2.rda")
# Amphioxus reanalysis
load("outputs/rda/bflo_reanalysis.rda")

For more functional annotation, we will use the gene age, COG functional categories, and the GO terms of Ptychodera.

load("outputs/rda/geneage.rda")

pfla_cogs <- read.table(
  "outputs/functional_annotation/COGs/pfla_cogs.tsv",
  col.names = c("id","cog")
)

# GO terms
pfla_id2go <- 
  readMappings(
    "outputs/functional_annotation/go_blast2go/GO_annotation.txt"
  )

Alternative method for 1-to-1 orthologs: Blast best reciprocal hits

Comparison of 1-to-1 TFs using blast best RBHs

rbh_pfla_bflo <- read.delim2("outputs/comparative/rbh/pfla_bflo_rbh.tsv", header = FALSE)
rbh_pfla_bflo[,2] <- gsub(" ","",rbh_pfla_bflo[,2])
rbh_pfla_bflo[,1] <- gsub("\\.p[0-9]+","",rbh_pfla_bflo[,1])

Ptychodera and Amphioxus

# Expression data, using all genes. No need for transformation
a = pfla_rna_counts
b = bflo_counts
a_samples = levels(condition_x)
b_samples = unique(sub("_.$", "", colnames(b)))

# Family/orthology data
o = unique(rbh_pfla_bflo)

# GOs
a_universe = rownames(vsd_allgen)
a_id2go = pfla_id2go
# Tidy up
samples_a = a_samples
a = qnorm(a)
## Loading required package: preprocessCore
a = tidyup(a, highlyvariable = FALSE) # remove genes with 0 tpms
a = rep2means(samples_a,a)

samples_b = b_samples
b = qnorm(b)
b = tidyup(b, highlyvariable = FALSE)
b = rep2means(samples_b,b) # remove genes with 0 tpms

o = pair2id(o)

# MERGE
merge_ab <- mergedata(a,b,o)

# CORRELATIONS
cors <- rawcorsp(merge_ab$a_o,merge_ab$b_o) # FIX JSD

# CORRELATIONS
plot_cors(cors)

Highly correlated genes

#Common genes in cors
ab_common_genes_cor <- get_high_cor_genes(
  mat = cors$js,
  a_o = merge_ab$a_o,
  b_o = merge_ab$b_o,
  o = o
)
## Loading required package: ggpointdensity
## The top five similar pair of stages are: 24.96003 25.22076 25.28024 25.57554 25.62751 
## [1] "Subsetting count matrices"
## [1] "Model fitting and top genes"
## Loading required package: edgeR
#Common genes in Correlations (GO)
ab_common_genes_cor_GOs <- 
  getGOs(
    genelist = 
      lapply(
        ab_common_genes_cor$hicor_topgenes, 
        function(sub_list) {
          setNames(sub_list$top_genes$a, names(sub_list))
          }),
    gene_universe = a_universe,
    gene2GO = a_id2go
  )
## [1] "Starting analysis 1 of 5"
## 
## Building most specific GOs .....
##  ( 15540 GO terms found. )
## 
## Build GO DAG topology ..........
##  ( 15584 GO terms and 36018 relations. )
## 
## Annotating nodes ...............
##  ( 15464 genes annotated to the GO terms. )
## 
##           -- Classic Algorithm -- 
## 
##       the algorithm is scoring 1141 nontrivial nodes
##       parameters: 
##           test statistic: fisher
## [1] "Starting analysis 2 of 5"
## 
## Building most specific GOs .....
##  ( 15540 GO terms found. )
## 
## Build GO DAG topology ..........
##  ( 15584 GO terms and 36018 relations. )
## 
## Annotating nodes ...............
##  ( 15464 genes annotated to the GO terms. )
## 
##           -- Classic Algorithm -- 
## 
##       the algorithm is scoring 871 nontrivial nodes
##       parameters: 
##           test statistic: fisher
## [1] "Starting analysis 3 of 5"
## 
## Building most specific GOs .....
##  ( 15540 GO terms found. )
## 
## Build GO DAG topology ..........
##  ( 15584 GO terms and 36018 relations. )
## 
## Annotating nodes ...............
##  ( 15464 genes annotated to the GO terms. )
## 
##           -- Classic Algorithm -- 
## 
##       the algorithm is scoring 1204 nontrivial nodes
##       parameters: 
##           test statistic: fisher
## [1] "Starting analysis 4 of 5"
## 
## Building most specific GOs .....
##  ( 15540 GO terms found. )
## 
## Build GO DAG topology ..........
##  ( 15584 GO terms and 36018 relations. )
## 
## Annotating nodes ...............
##  ( 15464 genes annotated to the GO terms. )
## 
##           -- Classic Algorithm -- 
## 
##       the algorithm is scoring 645 nontrivial nodes
##       parameters: 
##           test statistic: fisher
## [1] "Starting analysis 5 of 5"
## 
## Building most specific GOs .....
##  ( 15540 GO terms found. )
## 
## Build GO DAG topology ..........
##  ( 15584 GO terms and 36018 relations. )
## 
## Annotating nodes ...............
##  ( 15464 genes annotated to the GO terms. )
## 
##           -- Classic Algorithm -- 
## 
##       the algorithm is scoring 1381 nontrivial nodes
##       parameters: 
##           test statistic: fisher
pf_bf_js <- 
  Heatmap(
    cors$js,
    cluster_rows = F,
    cluster_columns = F, 
    show_row_names = TRUE, 
    name = "JSD", 
    col = brewer.pal(10,"RdBu"),
    left_annotation = devstages_ha_rows(),
    top_annotation = quick_ha(colnames(cors$js), rev = TRUE)
    )
pdf(
  file = "graphics/pfla_bflo_jsd.pdf",
  height = 4.5,
  width = 4.5
)
draw(pf_bf_js)
dev.off()
## png 
##   2
draw(pf_bf_js)

plot_grid(
  ab_common_genes_cor$hicor_topgenes[[1]]$plot_topgenes,
  ab_common_genes_cor_GOs$GOplot[[1]],
  ab_common_genes_cor$hicor_topgenes[[2]]$plot_topgenes,
  ab_common_genes_cor_GOs$GOplot[[2]],
  ab_common_genes_cor$hicor_topgenes[[3]]$plot_topgenes,
  ab_common_genes_cor_GOs$GOplot[[3]],
  ab_common_genes_cor$hicor_topgenes[[4]]$plot_topgenes,
  ab_common_genes_cor_GOs$GOplot[[4]],
  ab_common_genes_cor$hicor_topgenes[[5]]$plot_topgenes,
  ab_common_genes_cor_GOs$GOplot[[5]],
  ncol = 2
  )

pdf(
  file = "graphics/pfla_bflo_hi_cor_genes.pdf",
  width = 8,
  height = 25
)
plot_grid(
  ab_common_genes_cor$hicor_topgenes[[1]]$plot_topgenes,
  ab_common_genes_cor_GOs$GOplot[[1]],
  ab_common_genes_cor$hicor_topgenes[[2]]$plot_topgenes,
  ab_common_genes_cor_GOs$GOplot[[2]],
  ab_common_genes_cor$hicor_topgenes[[3]]$plot_topgenes,
  ab_common_genes_cor_GOs$GOplot[[3]],
  ab_common_genes_cor$hicor_topgenes[[4]]$plot_topgenes,
  ab_common_genes_cor_GOs$GOplot[[4]],
  ab_common_genes_cor$hicor_topgenes[[5]]$plot_topgenes,
  ab_common_genes_cor_GOs$GOplot[[5]],
  ncol = 2
  )
dev.off()
## png 
##   2

Using OMA

We will follow by loading all the gene family data that we need. This is the gene/gfam lookup table (extracted from the OrthoXML output file from OMA) and the respective table to transform the gene indexes into the actual gene id of every species.

gene_gfam <- read.delim2("outputs/comparative/20240404_oma/oma_omaid_gfam.tsv", header = TRUE)
colnames(gene_gfam) <- c("id","gfam")
gene_gfam$id <- gsub("\\.p[0-9]+","",gene_gfam$id)
oma_pfla_bflo <- read.delim2("outputs/comparative/20240404_oma/one_to_one_orthologues/1to1_pfla_bflo.tsv", header = FALSE)
oma_pfla_bflo$V2 <- gsub(" ","",oma_pfla_bflo$V2)
load("outputs/rda/geneage.rda")

For more functional annotation, we will use the COG functional categories and the GO terms of Ptychodera.

pfla_cogs <- read.table(
  "outputs/functional_annotation/COGs/pfla_cogs.tsv",
  col.names = c("id","cog")
)

# GO terms
pfla_id2go <- 
  readMappings(
    "outputs/functional_annotation/go_blast2go/GO_annotation.txt"
  )

We will first compare the transcriptomes and stage-specific clusters of Ptychodera and Amphioxus. For this, we will define a number of objects that will enter into our custom wrapper function comparABle. The most important of them being:

  • Objects a and b are the expression profile datasets (gene x condition/stage/etc) of species A and B.
  • Object o is a 1-to-1 orthologue association file between genes of species A and B (e.g. BLAST reciprocal best hits, OMA 1:1 orthologues, …)
  • Object f is a “gene family”/orthogroup file associating genes from species A and B to “gene families”
  • Objects ma, mb are a gene classification system. For each species a and b, a file associating genes to gene moules (e.g. k-means clusters, hierarchical clusters, WGCNA modules, etc.)
  • Object g is a gene age file for the gene families, and the genes, of each species.

In addition, we will also pass it a list of gene ontologies, a COG functional category association file, and a list of gene ages that are shared between species A and B.

# Expression data
a = pfla_rna_counts
b = bflo_counts

# Samples for rowmeans_by repl
a_samples = levels(condition_x)
b_samples = unique(sub("_.$", "", colnames(b)))

# Family/orthology data
o = unique(oma_pfla_bflo)
f = gene_gfam[grep("TCONS|bflo",gene_gfam$id),]

# Module/Cluster information
ma = data.frame(
  id = rownames(pfla_rna_dev),
  module = pfla_rna_dev$cID
)

mb = bflo_cl
colnames(mb) <- c("id","module")

# Gene Age
ga = pfla_age[,c(1,3)]
colnames(ga) = c("id","age")

# COG
cog_a <- pfla_cogs

# GOs
a_universe = rownames(vsd_allgen)
a_id2go = pfla_id2go

# Common Evo Nodes
common_evo_nodes = unique(ga$age)[!(unique(ga$age) %in% c("6_ambulacraria","7_hemichordata","8_Pfla"))]

Below we will run the comparABle wrapper. This can take some time.

PFLA_BFLO_COMPARISON <- comparABle(
  a_name = "P.flava",
  b_name = "B.floridae",
  a = a,
  b = b,
  o = o,
  f = f,
  ma = ma,
  mb = mb,
  ga = ga,
  gb = gb,
  cog_a = cog_a,
  cog_b = cog_b,
  a_samples = a_samples,
  b_samples = b_samples,
  highlyvariable = FALSE,
  cooc_p = 0.05,
  cooc_h = c(0.70,0.95),
  cooc_cor_method = "pearson",
  a_universe = a_universe,
  a_id2go = a_id2go,
  common_evo_nodes = common_evo_nodes,
  sep = ",\ "
)
## [1] "Tidy up data"
## [1] "Merge data"
## [1] "Correlations"
## [1] "PCA"
## [1] "Co-Occurrence"
## [1] "Common genes in Correlations"
## The top five similar pair of stages are: 18.0721 18.47827 18.58822 18.62579 18.71676 
## [1] "Subsetting count matrices"
## [1] "Model fitting and top genes"
## [1] "Common genes in Correlations (GO)"
## [1] "Starting analysis 1 of 5"
## [1] "Starting analysis 2 of 5"
## [1] "Starting analysis 3 of 5"
## [1] "Starting analysis 4 of 5"
## [1] "Starting analysis 5 of 5"
## [1] "Common genes in Correlations (age)"
## [1] "Pairwise Orthology Overlap Strategy across modules -- hypergeometric and binonmial tests"
## [1] "Getting info on genes from shared families across modules"
## [1] "Starting analysis 1 of 20"
## [1] "Starting analysis 2 of 20"
## [1] "Starting analysis 3 of 20"
## [1] "Starting analysis 4 of 20"
## [1] "Starting analysis 5 of 20"
## [1] "Starting analysis 6 of 20"
## [1] "Starting analysis 7 of 20"
## [1] "Starting analysis 8 of 20"
## [1] "Starting analysis 9 of 20"
## [1] "Starting analysis 10 of 20"
## [1] "Starting analysis 11 of 20"
## [1] "Starting analysis 12 of 20"
## [1] "Starting analysis 13 of 20"
## [1] "Starting analysis 14 of 20"
## [1] "Starting analysis 15 of 20"
## [1] "Starting analysis 16 of 20"
## [1] "Starting analysis 17 of 20"
## [1] "Starting analysis 18 of 20"
## [1] "Starting analysis 19 of 20"
## [1] "Starting analysis 20 of 20"
## [1] "Plots"
## [1] "Generating results"

plot_cors(PFLA_BFLO_COMPARISON$pairwise_correlations)

# Common genes, highly correlated
p1 <- plot_hicor_genes(
  scatter = PFLA_BFLO_COMPARISON$high_corr_genes$pairwise_data$hicor_topgenes$cor_08_To__09_L2$plot_topgenes,
  GO_plot = PFLA_BFLO_COMPARISON$high_corr_genes$GOs$GOplot$cor_08_To__09_L2,
  age_table = PFLA_BFLO_COMPARISON$high_corr_genes$age$cor_08_To__09_L2$AgeperModule,
  age_enrichemnt_hm = PFLA_BFLO_COMPARISON$high_corr_genes$age$cor_08_To__09_L2$heatmap
)
p2 <- plot_hicor_genes(
  scatter = PFLA_BFLO_COMPARISON$high_corr_genes$pairwise_data$hicor_topgenes$cor_07_LG__08_L1$plot_topgenes,
  GO_plot = PFLA_BFLO_COMPARISON$high_corr_genes$GOs$GOplot$cor_07_LG__08_L1,
  age_table = PFLA_BFLO_COMPARISON$high_corr_genes$age$cor_07_LG__08_L1$AgeperModule,
  age_enrichemnt_hm = PFLA_BFLO_COMPARISON$high_corr_genes$age$cor_07_LG__08_L1$heatmap
)
p3 <- plot_hicor_genes(
  scatter = PFLA_BFLO_COMPARISON$high_corr_genes$pairwise_data$hicor_topgenes$cor_08_To__08_L1$plot_topgenes,
  GO_plot = PFLA_BFLO_COMPARISON$high_corr_genes$GOs$GOplot$cor_08_To__08_L1,
  age_table = PFLA_BFLO_COMPARISON$high_corr_genes$age$cor_08_To__08_L1$AgeperModule,
  age_enrichemnt_hm = PFLA_BFLO_COMPARISON$high_corr_genes$age$cor_08_To__08_L1$heatmap
)

plot_grid(p1,p2,p3,ncol = 1)

Heatmap(
  name = "co-occurrence",
  PFLA_BFLO_COMPARISON$coocurrence_analysis$cooccurrence[1:16,17:28],
  cluster_rows = F,
  cluster_columns = F,
  col = sequential_hcl(10,"YlOrRd", rev = TRUE)
)

PFLA_BFLO_COMPARISON$plots$orthology_overlap_binomial_hm+
PFLA_BFLO_COMPARISON$plots$orthology_overlap_hypgeom_hm

And here the combination of gene age and functional categories enrichment:

PFLA_BFLO_COMPARISON$orthology_overlap_modules$
  genes_in_common_fams$commonfams$
    age_a_common$heatmap +
PFLA_BFLO_COMPARISON$orthology_overlap_modules$
  genes_in_common_fams$commonfams$
    cog_a_comon$heatmap

save(
  PFLA_BFLO_COMPARISON,
  file = "outputs/rda/species_comparison_pfla_bflo.rda"
  )